/**
 * Summary: utility class for crawling news pages.
 * Workflow: (none documented)
 * @author haozi
 */
package com.wooophone.test;  
 
import java.util.regex.Matcher;  
import java.util.regex.Pattern;  
 
import org.apache.http.HttpEntity;  
import org.apache.http.HttpResponse;  
import org.apache.http.client.HttpClient;  
import org.apache.http.client.methods.HttpGet;  
import org.apache.http.impl.client.DefaultHttpClient;  
import org.apache.http.util.EntityUtils;  
import org.htmlparser.Node;  
import org.htmlparser.Parser;  
import org.htmlparser.tags.ScriptTag;  
import org.htmlparser.tags.StyleTag;  
import org.htmlparser.visitors.ObjectFindingVisitor;  
import org.htmlparser.visitors.TextExtractingVisitor;  
 
/**
 * Static helpers for downloading a news page and reducing its HTML to the
 * parts that matter for extraction: fetching the page, stripping style and
 * script elements, isolating the body element, collapsing whitespace and
 * extracting plain text or specific tag types.
 *
 * <p>Failures inside the HTTP and HTML-parsing helpers are reported with
 * {@code printStackTrace()} and turned into a fallback return value rather
 * than propagated; see each method for the exact fallback.
 */
public class CrawNewsTools {

    /** Start of an opening or closing body tag, in any letter case. */
    private static final Pattern BODY_TAG_START =
            Pattern.compile("<(/?)body", Pattern.CASE_INSENSITIVE);

    /**
     * Span from the first lowercase opening body tag to the last lowercase
     * closing body tag. DOTALL lets the content run across line breaks.
     */
    private static final Pattern BODY_BLOCK =
            Pattern.compile("<body[^>]*>(.*)</body>", Pattern.DOTALL);

    /** Carriage return, line feed, tab and the full-width space (U+3000). */
    private static final Pattern LINE_NOISE =
            Pattern.compile("[\\r\\n\\t\\u3000]");

    /** Two or more consecutive ASCII spaces. */
    private static final Pattern SPACE_RUN = Pattern.compile(" {2,}");

    /**
     * Fetches {@code url} with an HTTP GET and returns the cleaned-up body
     * element of the response: style and script elements removed, body tag
     * lower-cased, body element extracted, whitespace collapsed.
     *
     * @param url    address of the page to fetch
     * @param encode charset name to use; when {@code null} or blank the
     *               charset declared by the response is used, and UTF-8 when
     *               the response declares none
     * @return the formatted body element; an empty string when the request
     *         fails or the response carries no entity; {@code null} when the
     *         page has no body element or the body is blank
     */
    public static String getPage(String url, String encode) {
        String page = "";
        HttpClient httpClient = null;
        try {
            httpClient = new DefaultHttpClient();
            HttpGet httpget = new HttpGet(url);
            System.out.println("请求URI路径:" + httpget.getURI());

            HttpResponse response = httpClient.execute(httpget);
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                // No content to decode; keep the empty-string fallback.
                return page;
            }

            String charset = EntityUtils.getContentCharSet(httpEntity);
            System.out.println("###当前页面编码:" + charset);

            // Resolve the charset: explicit argument, then the page's own, then UTF-8.
            String effectiveEncode;
            if (encode != null && !encode.trim().equals("")) {
                System.out.println("###采用指定编码:" + encode);
                effectiveEncode = encode;
            } else if (charset != null) {
                System.out.println("###采用网页自身编码:" + charset);
                effectiveEncode = charset;
            } else {
                System.out.println("###采用默认UTF-8编码:UTF-8");
                effectiveEncode = "UTF-8";
            }

            // NOTE(review): in HttpClient 4.x EntityUtils.toString appears to treat
            // its second argument as a fallback used only when the entity declares
            // no charset - confirm whether an explicit 'encode' is meant to override
            // the page's own charset.
            page = EntityUtils.toString(httpEntity, effectiveEncode);
            page = removeCssTag(page, effectiveEncode);
            page = removeJsTag(page, effectiveEncode);
            // getBody only recognises lowercase tags, so normalise the case first.
            page = fromatBodyTag(page);
            page = getBody(page);
            page = formatHtml(page);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // The client is still null if its construction failed; guarding avoids
            // a NullPointerException that would hide the original error.
            if (httpClient != null) {
                httpClient.getConnectionManager().shutdown();
            }
        }
        return page;
    }

    /**
     * Lower-cases the tag name of every opening and closing body tag,
     * whatever its original letter case. Attributes and all other text are
     * left untouched.
     *
     * @param htmlCode HTML source, may be {@code null}
     * @return the source with body tags normalised to lowercase, or
     *         {@code null} when {@code htmlCode} is {@code null}
     */
    public static String fromatBodyTag(String htmlCode) {
        if (htmlCode == null) {
            return null;
        }
        // $1 re-inserts the optional slash so closing tags stay closing tags.
        return BODY_TAG_START.matcher(htmlCode).replaceAll("<$1body");
    }

    /**
     * Extracts the body element, tags included, from the first lowercase
     * opening body tag up to the last lowercase closing body tag.
     *
     * @param htmlCode HTML source, may be {@code null}
     * @return the matched body element, or {@code null} when
     *         {@code htmlCode} is {@code null} or contains no such element
     */
    public static String getBody(String htmlCode) {
        if (htmlCode == null) {
            return null;
        }
        Matcher matcher = BODY_BLOCK.matcher(htmlCode);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Removes every style element found by the HTML parser.
     *
     * @param htmlCode HTML source, may be {@code null}
     * @param encode   charset name handed to the parser
     * @return the source without style elements; the partially processed
     *         source when parsing fails; {@code null} for {@code null} input
     */
    public static String removeCssTag(String htmlCode, String encode) {
        return removeTags(htmlCode, encode, StyleTag.class);
    }

    /**
     * Removes every script element found by the HTML parser.
     *
     * @param htmlCode HTML source, may be {@code null}
     * @param encode   charset name handed to the parser
     * @return the source without script elements; the partially processed
     *         source when parsing fails; {@code null} for {@code null} input
     */
    public static String removeJsTag(String htmlCode, String encode) {
        return removeTags(htmlCode, encode, ScriptTag.class);
    }

    /**
     * Collapses whitespace in HTML source: carriage returns, line feeds,
     * tabs and full-width spaces are removed, then every run of ASCII spaces
     * is reduced to a single space.
     *
     * @param htmlcode HTML source, may be {@code null}
     * @return the compacted source, or {@code null} when {@code htmlcode}
     *         is {@code null} or blank after trimming
     */
    public static String formatHtml(String htmlcode) {
        if (htmlcode == null || htmlcode.trim().length() == 0) {
            return null;
        }
        // Remove the characters first: deleting one that sits between two spaces
        // creates a new run, which the collapse step must still see.
        String compact = LINE_NOISE.matcher(htmlcode).replaceAll("");
        return SPACE_RUN.matcher(compact).replaceAll(" ");
    }

    /**
     * Strips all HTML tags from {@code content} with the HTML parser and
     * returns the remaining plain text.
     *
     * @param content HTML source, may be {@code null}
     * @param encode  charset name handed to the parser
     * @return the extracted text; {@code content} unchanged when parsing
     *         fails; {@code null} for {@code null} input
     */
    public static String getText(String content, String encode) {
        if (content == null) {
            return null;
        }
        String result = content;
        try {
            Parser parser = Parser.createParser(content, encode);
            TextExtractingVisitor visitor = new TextExtractingVisitor();
            parser.visitAllNodesWith(visitor);
            result = visitor.getExtractedText();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return result;
    }

    /**
     * Collects every node of the given tag class from the HTML source.
     *
     * @param htmlCode HTML source, may be {@code null}
     * @param encode   charset name handed to the parser
     * @param t        tag class to search for, e.g. {@code ScriptTag.class}
     * @return the matching nodes, or {@code null} when {@code htmlCode} is
     *         {@code null} or parsing fails
     */
    @SuppressWarnings("unchecked")
    public static Node[] getTagList(String htmlCode, String encode, Class t) {
        if (htmlCode == null) {
            return null;
        }
        Node[] nodes = null;
        try {
            Parser parser = Parser.createParser(htmlCode, encode);
            ObjectFindingVisitor visitor = new ObjectFindingVisitor(t);
            parser.visitAllNodesWith(visitor);
            nodes = visitor.getTags();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return nodes;
    }

    /**
     * Tells whether the whole of {@code isRegexString} matches the regular
     * expression {@code regexString}.
     *
     * @param isRegexString text to test
     * @param regexString   regular expression the entire text must match
     * @return {@code true} when the entire text matches
     */
    public static boolean isRegex(String isRegexString, String regexString) {
        return Pattern.matches(regexString, isRegexString);
    }

    /**
     * Shared implementation of the tag-removal helpers: finds all nodes of
     * {@code tagType} and deletes their HTML text from the source.
     */
    private static String removeTags(String htmlCode, String encode, Class<?> tagType) {
        if (htmlCode == null) {
            return null;
        }
        String stripped = htmlCode;
        try {
            Parser parser = Parser.createParser(htmlCode, encode);
            ObjectFindingVisitor visitor = new ObjectFindingVisitor(tagType);
            parser.visitAllNodesWith(visitor);
            for (Node node : visitor.getTags()) {
                // Removal is textual, so it only takes effect where the node's
                // rendered HTML occurs verbatim in the source.
                stripped = stripped.replace(node.toHtml(), "");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return stripped;
    }
}